# Load the IMDb movie-review text data

import pyprind
import pandas as pd
import os
# Root directory of the extracted review corpus.
basepath = 'aclImdb'
# Sub-directory name -> integer sentiment label.
labels = {'pos': 1, 'neg': 0}
# Progress bar sized for the 50000 iterations expected below.
pbar = pyprind.ProgBar(50000)

# Collect the rows in a plain list and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it once per file
# copied the whole frame on every iteration.
rows = []
for split in ('test', 'train'):
    for label in ('pos', 'neg'):
        path = os.path.join(basepath, split, label)
        # Sorted so the row order does not depend on the OS listing order.
        for filename in sorted(os.listdir(path)):
            with open(os.path.join(path, filename), 'r',
                      encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[label]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])
# Shuffle the rows with a fixed seed (reproducible order) and store them
# as a CSV file.
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')
# Reload the data from the CSV. The file was written without its index, so
# the reloaded frame gets a fresh default index in the shuffled row order.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
# Data preprocessing
import re
def preprocessor(text: str) -> str:
    """Return a cleaned, lower-cased version of *text*.

    Steps:
      1. Remove HTML tags (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` nose removed),
         separated by spaces, to the end of the cleaned text.

    Args:
        text: Raw text, possibly containing HTML markup.

    Returns:
        The cleaned text followed by any emoticons found in it.
    """
    # The pattern must include the closing '>'; without it a lone '<'
    # (e.g. "I <3 this") would swallow everything up to the end of the text.
    text = re.sub(r'<[^>]*>', '', text)  # HTML
    # Emoticons are extracted before punctuation is stripped in the next step.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # Keep the emoticons from being glued onto the last word when the
        # cleaned text does not already end with a space.
        separator = '' if cleaned.endswith(' ') else ' '
        cleaned += separator + ' '.join(emoticons).replace('-', '')
    return cleaned
# Clean every review text with preprocessor().
df['review'] = df['review'].apply(preprocessor)

# Split data: the first 25000 rows are for training, the rest for testing.
# Positional slicing is used on purpose: label-based `.loc[:25000]` and
# `.loc[25000:]` are both inclusive, which put row 25000 into BOTH sets.
n_train = 25000
X_train = df['review'].values[:n_train]
y_train = df['sentiment'].values[:n_train]
X_test = df['review'].values[n_train:]
y_test = df['sentiment'].values[n_train:]